# Importing necessary libraries for data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Read the Flipkart sales CSV into a DataFrame, decoding the file with the 'unicode_escape' codec
flip_kart_df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\jki\Downloads\flipkart_sales_data.csv",
    encoding="unicode_escape",
)
# Preview the first five rows
flip_kart_df.head()


# Dimensions of the freshly loaded DataFrame as a (rows, columns) tuple
flip_kart_df.shape

(11251, 15)


# Print column names, non-null counts and dtypes of the freshly loaded data
flip_kart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11251 entries, 0 to 11250
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   User_ID           11251 non-null  int64  
 1   Cust_name         11251 non-null  object 
 2   Product_ID        11251 non-null  object 
 3   Gender            11251 non-null  object 
 4   Age Group         11251 non-null  object 
 5   Age               11251 non-null  int64  
 6   Marital_Status    11251 non-null  int64  
 7   State             11251 non-null  object 
 8   Zone              11251 non-null  object 
 9   Occupation        11251 non-null  object 
 10  Product_Category  11251 non-null  object 
 11  Orders            11251 non-null  int64  
 12  Amount            11239 non-null  float64
 13  Status            0 non-null      float64
 14  unnamed1          0 non-null      float64
dtypes: float64(3), int64(4), object(8)
memory usage: 1.3+ MB


# 'Status' and 'unnamed1' contain no values at all (0 non-null entries in the
# info() summary), so both columns are removed from the DataFrame in place
flip_kart_df.drop(columns=['Status', 'unnamed1'], inplace=True)


# Count the missing values that remain in each column
flip_kart_df.isnull().sum()

User_ID              0
Cust_name            0
Product_ID           0
Gender               0
Age Group            0
Age                  0
Marital_Status       0
State                0
Zone                 0
Occupation           0
Product_Category     0
Orders               0
Amount              12
dtype: int64


# Discard, in place, every row that contains at least one missing value
flip_kart_df.dropna(axis=0, how='any', inplace=True)


# Cast 'Amount' from float to the platform's default integer type
flip_kart_df['Amount'] = flip_kart_df['Amount'].astype(int)


# Confirm the dtype of 'Amount' after the cast
flip_kart_df['Amount'].dtype

dtype('int32')


# List the column labels that remain after cleaning
flip_kart_df.columns

Index(['User_ID', 'Cust_name', 'Product_ID', 'Gender', 'Age Group', 'Age',
       'Marital_Status', 'State', 'Zone', 'Occupation', 'Product_Category',
       'Orders', 'Amount'],
      dtype='object')


# Give the customer-name column a more descriptive label (modifies the DataFrame in place)
flip_kart_df.rename({'Cust_name': 'Customer_name'}, axis='columns', inplace=True)


# Descriptive statistics for every numeric column
flip_kart_df.describe()


# The same statistics restricted to 'Age', 'Orders' and 'Amount'
flip_kart_df.loc[:, ['Age', 'Orders', 'Amount']].describe()


# Re-inspect columns, non-null counts and dtypes after cleaning
flip_kart_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11239 entries, 0 to 11250
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   User_ID           11239 non-null  int64 
 1   Customer_name     11239 non-null  object
 2   Product_ID        11239 non-null  object
 3   Gender            11239 non-null  object
 4   Age Group         11239 non-null  object
 5   Age               11239 non-null  int64 
 6   Marital_Status    11239 non-null  int64 
 7   State             11239 non-null  object
 8   Zone              11239 non-null  object
 9   Occupation        11239 non-null  object
 10  Product_Category  11239 non-null  object
 11  Orders            11239 non-null  int64 
 12  Amount            11239 non-null  int32 
dtypes: int32(1), int64(4), object(8)
memory usage: 1.2+ MB


# Orange canvas for the chart
plt.figure(figsize=(10, 6), facecolor='#F7A200')

# Chart title
plt.title('Gender Count')

# Horizontal count plot: one bar per gender, shaded with the 'Blues' palette
a = sns.countplot(data=flip_kart_df, y='Gender', hue='Gender', palette='Blues')

# Paint the axes area the same orange as the figure
a.patch.set_facecolor('#F7A200')

# Annotate every bar with its count
for container in a.containers:
    a.bar_label(container)


# Total sales amount per gender (one row per gender)
sales_gender = flip_kart_df.groupby(['Gender'], as_index=False)['Amount'].sum()

# One shade of blue per wedge
colors = sns.color_palette('Blues', n_colors=len(sales_gender))

# Orange canvas and chart title
plt.figure(figsize=(9, 6), facecolor='#F7A200')
plt.title('Total Sales By Gender')

# A callable autopct that returns '' makes pie() create one empty text object per
# wedge, positioned at pctdistance inside that wedge; the texts are filled in below.
patches, texts, autotexts = plt.pie(
    sales_gender['Amount'],
    labels=sales_gender['Gender'],
    autopct=lambda _pct: '',
    colors=colors,
    pctdistance=0.7,
)
plt.axis('equal')

# Write each gender's exact total into the text object of its own wedge, so the
# label always sits inside the matching wedge and no gender code is hard-coded.
for autotext, amount in zip(autotexts, sales_gender['Amount']):
    autotext.set_text(f"{amount}")
    autotext.set_fontsize(12)
    autotext.set_color('black')

# Legend placed to the right of the pie
plt.legend(title='Gender', loc='upper left', bbox_to_anchor=(1, 0.5))

<matplotlib.legend.Legend at 0x29d9a02ad10>


# Orange canvas and chart title
plt.figure(figsize=(9, 6), facecolor='#F7A200')
plt.title('Age group wise count')

# Customer count per age group, drawn as one bar per gender within each group
a = sns.countplot(x='Age Group', hue='Gender', palette='Blues', data=flip_kart_df)

# Match the axes background to the figure and hide the grid lines
a.patch.set_facecolor('#F7A200')
a.grid(False)

# Print the count on each bar
for container in a.containers:
    a.bar_label(container)


# Orange canvas and chart title
plt.figure(figsize=(9, 6), facecolor='#F7A200')
plt.title('Age Group Wise Total Amount')

# Total amount spent per age group, largest first
sales_age_group = (
    flip_kart_df.groupby(['Age Group'], as_index=False)['Amount']
    .sum()
    .sort_values(by='Amount', ascending=False)
)

# hue mirrors x so that the palette is applied per bar without relying on the
# deprecated "palette without hue" behaviour; dodge=False keeps full-width bars.
a = sns.barplot(
    x='Age Group',
    y='Amount',
    hue='Age Group',
    data=sales_age_group,
    palette='Blues',
    errorbar=None,
    dodge=False,
)

# The hue legend would only repeat the x tick labels, so drop it if one was drawn
if a.legend_:
    a.legend_.remove()

# Match the axes background to the figure
a.patch.set_facecolor('#f7a200')

# Hide the grid lines
a.grid(False)


# Orange canvas and chart title
plt.figure(figsize=(13, 6), facecolor='#F7A200')
plt.title('State-wise Distribution of Orders')

# Sum the orders per state, then keep the ten states with the most orders
orders_per_state = flip_kart_df.groupby(['State'], as_index=False)['Orders'].sum()
orders_state = orders_per_state.sort_values(by='Orders', ascending=False).head(10)

# Line plot of total orders per state with a round marker on every point
a = sns.lineplot(x='State', y='Orders', data=orders_state, marker='o', color='blue')

# Match the axes background to the figure
a.patch.set_facecolor('#F7A200')

# Show gray grid lines
a.grid(True, color='gray')

# Write the order total next to each data point
for state_name, order_total in zip(orders_state['State'], orders_state['Orders']):
    a.text(state_name, order_total, f'{order_total}', color='black', ha='left', va='center')

# Axis labels, vertical tick labels and a tight layout
plt.xlabel('State')
plt.ylabel('Total Orders')
plt.xticks(rotation=90)
plt.tight_layout()


import matplotlib.pyplot as plt
import seaborn as sns

# Orange canvas for the chart
plt.figure(figsize=(10, 6), facecolor='#F7A200')

# Chart title
plt.title('Total Sales Distribution by States')

# Total sales per state, keeping the ten states with the highest totals
sales_state = (
    flip_kart_df.groupby(['State'], as_index=False)['Amount']
    .sum()
    .sort_values(by='Amount', ascending=False)
    .head(10)
)

# hue mirrors x so that the palette is applied per bar without relying on the
# deprecated "palette without hue" behaviour; dodge=False keeps full-width bars.
a = sns.barplot(data=sales_state, x='State', y='Amount', hue='State', palette='Blues', dodge=False)

# The hue legend would only repeat the x tick labels, so drop it if one was drawn
if a.legend_:
    a.legend_.remove()

# Match the axes background to the figure
a.patch.set_facecolor('#F7A200')

# Hide the grid lines
a.grid(False)

# Vertical x tick labels for readability
plt.xticks(rotation=90)

# Bars are drawn at x = 0, 1, 2, ... in row order. After sort_values the
# DataFrame index still holds the pre-sort row labels, so the label position
# must come from enumerate(), not from the index.
for position, amount in enumerate(sales_state['Amount']):
    a.text(position, amount, str(round(amount, 2)), color='black', ha="center")

plt.show()


# Total sales per (state, customer) pair
sales_by_customer = flip_kart_df.groupby(['State', 'Customer_name'])['Amount'].sum().reset_index()

# Top 5 customers by sales within each state: order rows by state and descending
# amount, then keep the first five rows of every state. This avoids
# groupby.apply, whose handling of the grouping column is deprecated in recent
# pandas releases, and keeps the 'State' column in the result.
sales_by_customer_sorted = (
    sales_by_customer
    .sort_values(['State', 'Amount'], ascending=[True, False])
    .groupby('State')
    .head(5)
    .reset_index(drop=True)
)

# States to plot, one subplot each
states = sales_by_customer_sorted['State'].unique()
num_states = len(states)

# Three subplots per row, rounding the row count up
num_rows = (num_states + 2) // 3

# squeeze=False guarantees a 2-D axes array even when only one row is needed,
# so axes[row, col] below is always valid
fig, axes = plt.subplots(num_rows, 3, figsize=(12, 5*num_rows), sharex=True, facecolor='#F7A200', squeeze=False)
fig.patch.set_facecolor('#F7A200')

# Five shades of blue, one per customer bar
colors = sns.color_palette('Blues', n_colors=5)

# One horizontal bar chart per state
for i, state in enumerate(states):
    row, col = divmod(i, 3)

    data = sales_by_customer_sorted[sales_by_customer_sorted['State'] == state]

    ax = sns.barplot(x='Amount', y='Customer_name', hue='Customer_name', data=data, ax=axes[row, col], palette=colors, dodge=False)

    ax.set_title(f'Top 5 Customers by Sales in {state}')
    ax.set_xlabel('Total Sales')
    ax.set_ylabel('Customer Name')
    ax.set_facecolor('#F7A200')

    # The hue legend only repeats the y tick labels; remove it when present
    if ax.legend_:
        ax.legend_.remove()

# Delete the grid cells that were not used by any state
for unused_ax in axes.flatten()[num_states:]:
    fig.delaxes(unused_ax)

plt.tight_layout()


# Orange canvas for the chart
plt.figure(figsize=(8, 8), facecolor='#F7A200')

# Chart title
plt.title('Marital Status Distribution Among Customers')

# Number of rows per marital-status code, most frequent first
marital_counts = flip_kart_df['Marital_Status'].value_counts()

# Translate the codes to readable labels by VALUE rather than by position, so
# the labels cannot swap if the frequency ranking of the codes changes.
# NOTE(review): 0 = Unmarried / 1 = Married is assumed - confirm against the
# dataset's documentation.
marital_counts = marital_counts.rename(index={0: 'Unmarried', 1: 'Married'})

# One shade of blue per wedge
colors = plt.cm.Blues(np.linspace(0.2, 1, len(marital_counts)))

# Pie chart whose wedge labels show the status and its count
pie = plt.pie(marital_counts, labels=[f'{label} ({count})' for label, count in marital_counts.items()], colors=colors, startangle=90)

# Legend entries follow the wedge order
plt.legend(title='Marital Status', labels=list(marital_counts.index), loc='upper right')

# Fix the axis limits around the unit-radius pie
plt.axis((-1.1, 1.1, -1.1, 1.1))

(-1.1, 1.1, -1.1, 1.1)


# Orange canvas for the chart
plt.figure(figsize=(10, 6), facecolor='#F7A200')

# Chart title
plt.title('Total Sales Distribution by Marital Status')

# Total sales per (marital status, gender) pair, largest first
marital_gender_totals = flip_kart_df.groupby(['Marital_Status', 'Gender'], as_index=False)['Amount'].sum()
sales_marital = marital_gender_totals.sort_values(by='Amount', ascending=False)

# Grouped bar plot: marital status on the x axis, one bar per gender
a = sns.barplot(x='Marital_Status', y='Amount', hue='Gender', palette='Blues', data=sales_marital)

# Hide the grid lines
a.grid(False)

# Match the axes background to the figure
a.patch.set_facecolor('#F7A200')


import matplotlib.pyplot as plt
import seaborn as sns

# Orange canvas for the chart
plt.figure(figsize=(10, 6), facecolor='#F7A200')

# Chart title
plt.title('Distribution of Customers Across Occupations')

# Occupations ordered from most to least frequent
occupation_order = flip_kart_df['Occupation'].value_counts().index

# Horizontal count plot with one bar per occupation, in that order
a = sns.countplot(
    data=flip_kart_df,
    y='Occupation',
    hue='Occupation',
    palette='Blues',
    order=occupation_order,
)

# Hide the grid lines
a.grid(False)

# Match the axes background to the figure
a.patch.set_facecolor('#F7A200')

# Print the count on each bar
for container in a.containers:
    a.bar_label(container)

# Legend in the upper-right corner
plt.legend(title='Occupation', loc='upper right')

plt.show()


# Orange canvas for the chart
plt.figure(figsize=(10, 6), facecolor='#f7a200')

# Chart title
plt.title('Distribution of Sales Across Occupations')

# Total sales per occupation, largest first
occupation_totals = flip_kart_df.groupby(['Occupation'], as_index=False)['Amount'].sum()
sales_occupation = occupation_totals.sort_values(by='Amount', ascending=False)

# One full-width bar per occupation; hue mirrors x so every bar gets its own shade
a = sns.barplot(
    x='Occupation',
    y='Amount',
    hue='Occupation',
    data=sales_occupation,
    palette='Blues',
    dodge=False,
)

# Hide the grid lines
a.grid(False)

# Match the axes background to the figure
a.patch.set_facecolor('#f7a200')

# Vertical x tick labels for readability
plt.xticks(rotation=90)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 [Text(0, 0, 'IT Sector'),
  Text(1, 0, 'Healthcare'),
  Text(2, 0, 'Aviation'),
  Text(3, 0, 'Banking'),
  Text(4, 0, 'Govt'),
  Text(5, 0, 'Hospitality'),
  Text(6, 0, 'Media'),
  Text(7, 0, 'Automobile'),
  Text(8, 0, 'Chemical'),
  Text(9, 0, 'Lawyer'),
  Text(10, 0, 'Retail'),
  Text(11, 0, 'Food Processing'),
  Text(12, 0, 'Construction'),
  Text(13, 0, 'Textile'),
  Text(14, 0, 'Agriculture')])


import matplotlib.pyplot as plt
import seaborn as sns

# Orange canvas for the chart
plt.figure(figsize=(10, 6), facecolor='#F7A200')

# Chart title
plt.title('Distribution of Products by Category')

# Product categories ordered from most to least frequent
category_order = flip_kart_df['Product_Category'].value_counts().index

# Horizontal count plot with one bar per product category, in that order
a = sns.countplot(
    data=flip_kart_df,
    y='Product_Category',
    hue='Product_Category',
    palette='Blues',
    order=category_order,
)

# Hide the grid lines
a.grid(False)

# Match the axes background to the figure
a.patch.set_facecolor('#F7A200')

# Print the count on each bar
for container in a.containers:
    a.bar_label(container)

# Legend in the upper-right corner
plt.legend(title='Product Category', loc='upper right')

plt.show()


import matplotlib.pyplot as plt
import seaborn as sns

# Total sales per product category, keeping the ten best-selling categories
sales_product = (
    flip_kart_df.groupby(['Product_Category'], as_index=False)['Amount']
    .sum()
    .sort_values(by='Amount', ascending=False)
    .head(10)
)

# Number of distinct categories that will be plotted
num_categories = sales_product['Product_Category'].nunique()

# One shade of blue per plotted category
palette = sns.color_palette("Blues", n_colors=num_categories)

# Orange canvas and chart title
plt.figure(figsize=(10,6), facecolor='#F7A200')
plt.title('Distribution of Sales by Product')

# Bar plot of total sales per category; the palette computed above is passed in
# so that it is actually used
a = sns.barplot(data=sales_product, x='Product_Category', y='Amount', hue='Product_Category', palette=palette)

# Hide the grid lines
a.grid(False)

# Match the axes background to the figure
a.patch.set_facecolor('#F7A200')

# Vertical x tick labels for readability
plt.xticks(rotation=90)

# Legend in the upper-right corner
plt.legend(title='Product Category', loc='upper right')

plt.show()


import matplotlib.pyplot as plt
import seaborn as sns

# Total sales per (state, product category) pair
sales_by_product = flip_kart_df.groupby(['State', 'Product_Category'])['Amount'].sum().reset_index()

# Top 5 product categories by sales within each state: order rows by state and
# descending amount, then keep the first five rows of every state. This avoids
# groupby.apply, whose handling of the grouping column is deprecated in recent
# pandas releases, and keeps the 'State' column in the result.
sales_by_product_sorted = (
    sales_by_product
    .sort_values(['State', 'Amount'], ascending=[True, False])
    .groupby('State')
    .head(5)
    .reset_index(drop=True)
)

# States to plot; three subplots per row, rounding the row count up
states = sales_by_product_sorted['State'].unique()
num_states = len(states)
num_rows = (num_states + 2) // 3

# squeeze=False guarantees a 2-D axes array even when only one row is needed,
# so axes[row, col] below is always valid
fig, axes = plt.subplots(num_rows, 3, figsize=(12, 5*num_rows), sharex=True, facecolor='#F7A200', squeeze=False)
fig.patch.set_facecolor('#F7A200')

# One horizontal bar chart per state
for i, state in enumerate(states):
    row, col = divmod(i, 3)
    data = sales_by_product_sorted[sales_by_product_sorted['State'] == state]

    # dodge=False keeps full-width bars even though hue is set
    ax = sns.barplot(x='Amount', y='Product_Category', data=data, ax=axes[row, col], palette='Blues', hue='Product_Category', dodge=False)

    ax.set_title(f'Top 5 categories by Sales in {state}')
    ax.set_xlabel('Total Sales')
    ax.set_ylabel('Product_Category')
    ax.set_facecolor('#F7A200')

    # The hue legend only repeats the y tick labels. seaborn may not draw one at
    # all, so check before removing it.
    if ax.legend_:
        ax.legend_.remove()

# Delete the grid cells that were not used by any state
for unused_ax in axes.flatten()[num_states:]:
    fig.delaxes(unused_ax)

# No figure-level legend is added: the categories differ from state to state and
# are already spelled out on each subplot's y axis.
plt.tight_layout()
plt.show()


# Number of rows per (state, gender) pair, stored in a 'Count' column
state_gender_sizes = flip_kart_df.groupby(['State', 'Gender']).size()
gender_distribution = state_gender_sizes.reset_index(name='Count')

# Orange canvas and chart title
plt.figure(figsize=(12, 8), facecolor='#f7a200')
plt.title('Gender Distribution Across Different States')

# Grouped bar plot: states on the x axis, one bar per gender
ax = sns.barplot(x='State', y='Count', hue='Gender', palette='Blues', data=gender_distribution)

# Axis labels, no grid, orange axes background, legend and vertical tick labels
plt.xlabel('State')
plt.ylabel('Number of Customers')
plt.grid(False)
plt.gca().patch.set_facecolor('#f7a200')
plt.legend(title='Gender', loc='upper right')
plt.xticks(rotation=90)

# Write each bar's height 5 points above its top, centred horizontally
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height:.0f}',
        (bar_centre, bar_height),
        ha='center',
        va='baseline',
        xytext=(0, 5),
        textcoords='offset points',
    )


# Number of rows per (product category, gender) pair, stored in a 'Count' column
category_gender_sizes = flip_kart_df.groupby(['Product_Category', 'Gender']).size()
gender_by_category = category_gender_sizes.reset_index(name='Count')

# Orange canvas and chart title
plt.figure(figsize=(12, 8), facecolor='#f7a200')
plt.title('Gender Distribution by Product Category')

# One shade of blue per distinct gender value
palette = sns.color_palette("Blues", n_colors=len(gender_by_category['Gender'].unique()))

# Grouped bar plot: product categories on the x axis, one bar per gender
ax = sns.barplot(x='Product_Category', y='Count', hue='Gender', palette=palette, data=gender_by_category)

# Axis labels, no grid, orange axes background, legend and vertical tick labels
plt.xlabel('Product Category')
plt.ylabel('Number of Customers')
plt.grid(False)
plt.gca().patch.set_facecolor('#f7a200')
plt.legend(title='Gender', loc='upper right')
plt.xticks(rotation=90)

# Write each bar's height 5 points above its top, centred horizontally
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height:.0f}',
        (bar_centre, bar_height),
        ha='center',
        va='baseline',
        xytext=(0, 5),
        textcoords='offset points',
    )

	User_ID	Cust_name	Product_ID	Gender	Age Group	Age	Marital_Status	State	Zone	Occupation	Product_Category	Orders	Amount	Status	unnamed1
0	1002903	Kanishk	P00125942	F	26-35	28	0	Maharashtra	Western	Healthcare	Auto	1	23952.0	NaN	NaN
1	1000732	Aryan	P00110942	F	26-35	35	1	Andhra Pradesh	Southern	Govt	Auto	3	23934.0	NaN	NaN
2	1001990	Raunak	P00118542	F	26-35	35	1	Uttar Pradesh	Central	Automobile	Auto	3	23924.0	NaN	NaN
3	1001425	Suwarna	P00237842	M	0-17	16	0	Karnataka	Southern	Construction	Auto	2	23912.0	NaN	NaN
4	1000588	Pritam	P00057942	M	26-35	28	1	Gujarat	Western	Food Processing	Auto	2	23877.0	NaN	NaN

	User_ID	Age	Marital_Status	Orders	Amount
count	1.123900e+04	11239.000000	11239.000000	11239.000000	11239.000000
mean	1.003004e+06	35.410357	0.420055	2.489634	9453.610553
std	1.716039e+03	12.753866	0.493589	1.114967	5222.355168
min	1.000001e+06	12.000000	0.000000	1.000000	188.000000
25%	1.001492e+06	27.000000	0.000000	2.000000	5443.000000
50%	1.003064e+06	33.000000	0.000000	2.000000	8109.000000
75%	1.004426e+06	43.000000	1.000000	3.000000	12675.000000
max	1.006040e+06	92.000000	1.000000	4.000000	23952.000000

	Age	Orders	Amount
count	11239.000000	11239.000000	11239.000000
mean	35.410357	2.489634	9453.610553
std	12.753866	1.114967	5222.355168
min	12.000000	1.000000	188.000000
25%	27.000000	2.000000	5443.000000
50%	33.000000	2.000000	8109.000000
75%	43.000000	3.000000	12675.000000
max	92.000000	4.000000	23952.000000

Return Home

Flipkart Sales Analysis: Unveiling Customer Insights and Market Trends¶

Introduction¶

Project Overview¶

Objective¶

Key Insights¶

Project Description¶

Import Required Libraries¶

Load the Dataset¶

Data Cleaning: Drop Unrelated or Blank Columns¶

Exploratory Data Analysis¶

Gender Distribution: Bar Chart¶

Total Sales by Gender: Pie Chart¶

Age Group Analysis: Count Variation¶

Correlation Analysis: Age Groups and Total Amount Spent¶

Distribution of Orders Across Top 10 States¶

Distribution of Total Sales Across Top 10 States¶

Top 5 Customers by Sales in Different States¶

Distribution of Marital Status Among Customers¶

Sales Distribution by Marital Status¶

Distribution of Customers Across Occupations¶

Distribution of Sales Across Different Occupations¶

Distribution of Product Count by Category¶

Distribution of Sales Across Product Categories¶

Top 5 Product Categories by Sales in Different States¶

Gender Distribution Across Different States¶

Gender Distribution by Product Category¶

CONCLUSION¶

KEY INSIGHTS¶

STRATEGIC IMPLICATIONS¶